# This is an R script for the replication of the first application found in the following paper:
#
#       ===================================================================================================================
#        Pomeroy, C (2018) "The quantitative analysis of space policy: A review of current methods and future directions," 
#          Space Policy.
#       ===================================================================================================================
#
# R can be obtained from: http://www.r-project.org.
# Feel free to reach out with questions. Happy coding.
#
#  -Caleb
#   pomeroy.38@osu.edu
#   Department of Political Science, The Ohio State University


# ------ Load packages ------ #
setwd("YOUR DIRECTORY") # set your working directory to folder that contains the downloaded materials

require(devtools) # Install any of these packages if you don't have them already...
if (!require("readtext")) devtools::install_github("kbenoit/readtext")
library(quanteda) #install_version("quanteda", version = "1.3.4", repos = "http://cran.us.r-project.org") # the analysis was running using this version of quanteda
library(quanteda.dictionaries)
library(twitteR)
library(topicmodels)
library(lubridate)
library(ggplot2)
library(corrplot)
library(plyr)

set.seed(1912) # set seed for replicability




#                    Scrape Twitter 
# ======================================================= #

# The steps in this script are based on the analyses presented in the paper; load "tweets_replication_df.rds" to
# replicate the results in the paper; or, enter your own API access info and search for your own phrases of interest.


# --- Search Twitter for phrase --- #

# --- Personal API info:

api_key <- "INFO" # replace with your info 
api_secret <- "INFO"
access_token <- "INFO"
access_token_secret <- "INFO"
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)


# --- Access Twitter
tweets <- searchTwitter("\"space exploration\"", n = 10000, lang = "en") # access twitter and locate most recent 10,000 tweets w/ phrase "space exploration"

tweets_df <- do.call("rbind", lapply(tweets, as.data.frame)) # store results as a dataframe with tweets and associated metadata

tweets_df <- readRDS("tweets_replication_df.rds") # alternatively, here's a trimmed down version with only the necessary variables for replication purposes



# --- Preprocess tweets:

tweets_df <- tweets_df[-4485,] # this tweet was flagged downstream as missing so is removed (it only consists of nonsensical punctuation after processing)


tweets_c <- corpus(tweets_df, 
                   text_field = "text") # constructs corpus object to store all goodies

tweets_t <- tokens(tweets_c,
                   remove_numbers = T,
                   remove_punct = T,
                   remove_symbols = T,
                   remove_url = T,
                   remove_twitter = T)

tweets_dfm <- dfm(tweets_t, 
                  tolower = T, 
                  stem = T,
                  remove = stopwords("en"))

tweets_dfm_tw <- dfm_remove(tweets_dfm, c("amp", "rt", "t", "just", "got",
                                          "space", "explor", "https", "can", "h")) # some features that appear in the tweets that aren't helpful




#                       Analyses
# ======================================================= #

# Now that we've scraped Twitter and preprocessed the data, we can explore the data using a topic model and sentiment analysis


# --- Topic model --- #

# Fit your own topic model...
tweets_dtm <- convert(tweets_dfm_tw, to = "topicmodels") # get your data from quanteda to topicmodels

space_lda <- LDA(tweets_dtm, k = 5) # fit a topic model for k=5


#...or load the model presented in the paper
space_lda <- readRDS("space_lda.rds")

terms(space_lda, 5) # check out the top 5 terms for each topic

docvars(tweets_c, "topic") <- topics(space_lda) # add as a variable to our corpus

tweets_df$topic <- topics(space_lda) # as well as back to the original dataframe




# --- Sentiment Analysis --- #

output_space <- liwcalike(tweets_c, dictionary = data_dictionary_NRC) # score the tweets

output_space$sentiment <- output_space$positive - output_space$negative # calculate the overall sentiment

docvars(tweets_c, "sentiment") <- output_space$sentiment # can add the sentiment of the tweet back to the corpus as a variable

tweets_df$sentiment <- output_space$sentiment # as well as back to the original dataframe

tweets_df$day <- day(floor_date(ymd_hms(tweets_df$created), unit = "day")) # break tweets up by day

plot_data <- data.frame(topic = tweets_df$topic, # store it in a dataframe for plotting 
                        retweets = tweets_df$retweetCount,
                        sentiment = tweets_df$sentiment,
                        Date = tweets_df$day)


# --- Correlation between sentiment and retweets

cor.test(tweets_df$retweetCount, tweets_df$sentiment) # ...anddd it's negative.





# --- Plotting --- #

# Some descriptive statistics for plotting

list_tops <- list()
for (i in min(plot_data$Date):max(plot_data$Date)){
  
  df_tops <- data.frame()
  
  for(j in 1:5){
    
    sub <- subset(plot_data, Date == i)
    
    sub_top <- subset(sub, topic == j)
    
    n_tweets_top_i <- dim(sub_top)[1] # number of tweets per topic per day
    
    n_retweets_top_i <- sum(sub_top$retweets) # number of retweets per topic per day
    
    mean_sentiment_top_i <- mean(sub_top$sentiment) # mean of sentiment per topic per day
    
    df <- data.frame(n_tweets_top_i = n_tweets_top_i,
                     n_retweets_top_i = n_retweets_top_i,
                     mean_sentiment_top_i = mean_sentiment_top_i,
                     topic = j)
    
    df_tops <- rbind(df_tops, df)
    
    list_tops[[i]] <- df_tops
    
  }
}


tops_plot <- rbind.fill(list_tops) # cast the list back into a single dataframe

tops_plot$Date <- rep(min(plot_data$Date):max(plot_data$Date), each=length(unique(plot_data$topic))) # a date column

tops_plot$topic <- as.factor(tops_plot$topic) # tell R to recognize the topics as factors






# --- Frequency of tweets plot:

p <- ggplot(tops_plot, aes(x=Date, y=n_tweets_top_i, group=topic)) +
  geom_point(color="gray20", shape=1, size=2) +
  geom_line(aes(color=topic, linetype=topic),size=1) +
  ylab("# Tweets") +
  scale_color_manual(values = c("gray50", "gray30","steelblue4", "azure3", "indianred3")) +
  scale_x_time(breaks=c("8","10","12","14"), labels = c("Jun-8", "Jun-10", "Jun-12", "Jun-14")) +
  theme_light()

p + guides(color=guide_legend(title="Topic"), 
           group = guide_legend(title="Topic"),
           linetype = guide_legend(title="Topic")) +
           theme(text = element_text(size=16), legend.justification = "center")





# --- Frequency of retweets plot:


p2 <- ggplot(tops_plot, aes(x=Date, y=n_retweets_top_i, group=topic)) +
  geom_point(color="gray20", shape=1, size=2) +
  geom_line(aes(color=topic, linetype=topic),size=1) +
  ylab("# Retweets (log)") +
  scale_color_manual(values = c("gray50", "gray30","steelblue4", "azure3", "indianred3")) +
  scale_x_time(breaks=c("8","10","12","14"), labels = c("Jun-8", "Jun-10", "Jun-12", "Jun-14")) +
  theme_light() +
  scale_y_log10()

p2 + guides(color=guide_legend(title="Topic"), 
            group = guide_legend(title="Topic"),
            linetype = guide_legend(title="Topic")) +
            theme(text = element_text(size=16), legend.justification = "center")




# --- Sentiment plot:

sent_df <- data.frame()
sent_df <- rbind(sent_df, 
                 subset(tops_plot$mean_sentiment_top_i, tops_plot$topic==1),
                 subset(tops_plot$mean_sentiment_top_i, tops_plot$topic==2),
                 subset(tops_plot$mean_sentiment_top_i, tops_plot$topic==3),
                 subset(tops_plot$mean_sentiment_top_i, tops_plot$topic==4),
                 subset(tops_plot$mean_sentiment_top_i, tops_plot$topic==5))

rownames(sent_df) <- c("Topic 1", "Topic 2", "Topic 3", "Topic 4", "Topic 5") 
colnames(sent_df) <- c("Jun-7", "Jun-8", "Jun-9", "Jun-10", "Jun-11", "Jun-12", "Jun-13", "Jun-14", "Jun-15")

col <- colorRampPalette(c("indianred3", "white", "steelblue4"))

corrplot(as.matrix(sent_df),
         "circle",cl.pos="r", tl.col="black", tl.cex=.9,mar=c(.1,.1,.1,.1), tl.offset = .8,
         diag = T, outline = T,cl.lim = c(-12,12), is.corr = F, addgrid.col = "gray84",
         tl.srt = 45, cl.length = 3,
         col=col(100)) 




# --------------------- THE END. --------------------- #